Reinforcement learning problems involve learning what to do — how to map situations to actions — so as to maximize a numerical reward signal. In an essential way these are closed-loop problems, because the learning system’s actions influence its later inputs.
Beyond the agent and the environment, one can identify four main subelements of a reinforcement learning system: a policy, a reward signal, a value function, and, optionally, a model of the environment.
A policy is a mapping from perceived states of the environment to actions to be taken when in those states.
The policy is the core of a reinforcement learning agent in the sense that it alone is sufficient to determine behavior. In general, policies may be stochastic.
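For concreteness, here is a minimal sketch (not part of the tic-tac-toe program below) of a stochastic policy stored as a lookup table from states to action probabilities; the state and action names are purely illustrative.

import random

# hypothetical states and actions, only to illustrate a stochastic policy
policy = {
    'low_battery':  {'recharge': 0.9, 'search': 0.1},
    'high_battery': {'recharge': 0.1, 'search': 0.9},
}

def sample_action(state):
    # draw an action according to the probabilities the policy assigns in this state
    actions, probs = zip(*policy[state].items())
    return random.choices(actions, weights=probs, k=1)[0]

print(sample_action('low_battery'))  # usually 'recharge'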
A reward signal defines the goal in a reinforcement learning problem.
On each time step, the environment sends to the reinforcement learning agent a single number, a reward. The agent’s sole objective is to maximize the total reward it receives over the long run.
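As a small illustration (with made-up reward numbers), the quantity being maximized is the cumulative, possibly discounted, reward over an episode:

# rewards received on successive time steps of one episode (made-up numbers)
rewards = [0, 0, 1, 0, -1, 2]

def total_return(rewards, gamma=1.0):
    # sum of gamma**t * r_t over the episode; gamma=1 gives the plain total reward
    return sum((gamma ** t) * r for t, r in enumerate(rewards))

print(total_return(rewards))        # 2
print(total_return(rewards, 0.9))   # discounted return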
A value function specifies what is good in the long run.
Roughly speaking, the value of a state is the total amount of reward an agent can expect to accumulate over the future, starting from that state.
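Here is a minimal sketch of a tabular value estimate, in which each state's value is nudged a fraction of the way toward a target on every update. The same idea appears in Player.feedReward below, where the target for each visited state is the (updated) value of the state that followed it. The state name and default value here are placeholders.

values = {}        # state -> estimated long-run reward (the value function)
step_size = 0.1

def update_value(state, target):
    # move the current estimate a fraction (step_size) of the way toward the target
    v = values.get(state, 0.5)   # neutral default of 0.5 for unseen states
    values[state] = v + step_size * (target - v)

update_value('some_state', 1.0)   # e.g. an episode through this state ended in a win
print(values['some_state'])       # 0.55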
A model of the environment allows inferences to be made about how the environment will behave. Methods for solving reinforcement learning problems that use models and planning are called model-based methods, as opposed to simpler model-free methods, which are explicitly trial-and-error learners, viewed as almost the opposite of planning.
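As a rough sketch of what a model buys you: if the agent has (or learns) a table predicting the next state and reward for each state-action pair, it can evaluate actions by looking ahead with the model (planning) rather than by trial and error alone. The states, actions, and values below are made up for illustration.

# hypothetical one-step model: (state, action) -> (next state, reward)
model = {
    ('s0', 'left'):  ('s1', 0.0),
    ('s0', 'right'): ('s2', 1.0),
}

def plan_one_step(state, actions, value):
    # choose the action whose predicted reward plus predicted next-state value is largest
    def lookahead(a):
        next_state, reward = model[(state, a)]
        return reward + value[next_state]
    return max(actions, key=lookahead)

print(plan_one_step('s0', ['left', 'right'], {'s1': 0.0, 's2': 0.0}))  # 'right'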
Evolutionary methods (e.g., genetic algorithms and genetic programming) have advantages on problems in which the learning agent cannot accurately sense the state of its environment.
Reinforcement learning is a computational approach to understanding and automating goal-directed learning and decision-making. It is distinguished from other computational approaches by its emphasis on learning by an agent from direct interaction with its environment, without relying on exemplary supervision or complete models of the environment.
In [11]:
import numpy as np
import pickle
BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS
class State:
    def __init__(self):
        # the board is represented by an n * n array:
        # 1 represents a piece of the player who moves first,
        # -1 represents a piece of the other player,
        # 0 represents an empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.winner = None
        self.hashVal = None
        self.end = None

    # calculate a unique hash value for this state
    def getHash(self):
        if self.hashVal is None:
            self.hashVal = 0
            for i in self.data.reshape(BOARD_ROWS * BOARD_COLS):
                if i == -1:
                    i = 2
                self.hashVal = self.hashVal * 3 + i
        return int(self.hashVal)

    # determine whether a player has won the game or it is a tie
    def isEnd(self):
        if self.end is not None:
            return self.end
        results = []
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, i]
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]
        for result in results:
            if result == 3:
                self.winner = 1
                self.end = True
                return self.end
            if result == -3:
                self.winner = -1
                self.end = True
                return self.end
        # whether it's a tie
        sum = np.sum(np.abs(self.data))
        if sum == BOARD_ROWS * BOARD_COLS:
            self.winner = 0
            self.end = True
            return self.end
        # game is still going on
        self.end = False
        return self.end

    # @symbol: 1 or -1
    # place a piece with value symbol at position (i, j)
    def nextState(self, i, j, symbol):
        newState = State()
        newState.data = np.copy(self.data)
        newState.data[i, j] = symbol
        return newState

    # print the board
    def show(self):
        for i in range(0, BOARD_ROWS):
            print('-------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == 0:
                    token = '0'
                if self.data[i, j] == -1:
                    token = 'x'
                out += token + ' | '
            print(out)
        print('-------------')
def getAllStatesImpl(currentState, currentSymbol, allStates):
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):
            if currentState.data[i][j] == 0:
                newState = currentState.nextState(i, j, currentSymbol)
                newHash = newState.getHash()
                if newHash not in allStates.keys():
                    isEnd = newState.isEnd()
                    allStates[newHash] = (newState, isEnd)
                    if not isEnd:
                        getAllStatesImpl(newState, -currentSymbol, allStates)

def getAllStates():
    currentSymbol = 1
    currentState = State()
    allStates = dict()
    allStates[currentState.getHash()] = (currentState, currentState.isEnd())
    getAllStatesImpl(currentState, currentSymbol, allStates)
    return allStates

# all possible board configurations
allStates = getAllStates()
class Judger:
    # @player1: the player who moves first, its pieces are represented by 1
    # @player2: the other player, its pieces are represented by -1
    # @feedback: if True, both players receive rewards when the game ends
    def __init__(self, player1, player2, feedback=True):
        self.p1 = player1
        self.p2 = player2
        self.feedback = feedback
        self.currentPlayer = None
        self.p1Symbol = 1
        self.p2Symbol = -1
        self.p1.setSymbol(self.p1Symbol)
        self.p2.setSymbol(self.p2Symbol)
        self.currentState = State()
        self.allStates = allStates

    # give rewards to both players
    def giveReward(self):
        if self.currentState.winner == self.p1Symbol:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif self.currentState.winner == self.p2Symbol:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            # tie
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def feedCurrentState(self):
        self.p1.feedState(self.currentState)
        self.p2.feedState(self.currentState)

    def reset(self):
        self.p1.reset()
        self.p2.reset()
        self.currentState = State()
        self.currentPlayer = None

    # @show: if True, print the board after each move
    def play(self, show=False):
        self.reset()
        self.feedCurrentState()
        while True:
            # alternate the current player
            if self.currentPlayer == self.p1:
                self.currentPlayer = self.p2
            else:
                self.currentPlayer = self.p1
            if show:
                self.currentState.show()
            [i, j, symbol] = self.currentPlayer.takeAction()
            self.currentState = self.currentState.nextState(i, j, symbol)
            hashValue = self.currentState.getHash()
            self.currentState, isEnd = self.allStates[hashValue]
            self.feedCurrentState()
            if isEnd:
                if self.feedback:
                    self.giveReward()
                return self.currentState.winner
# AI player
class Player:
    # @stepSize: step size for updating value estimations
    # @exploreRate: probability of taking a random exploratory move
    def __init__(self, stepSize=0.1, exploreRate=0.1):
        self.allStates = allStates
        self.estimations = dict()
        self.stepSize = stepSize
        self.exploreRate = exploreRate
        self.states = []

    def reset(self):
        self.states = []

    def setSymbol(self, symbol):
        self.symbol = symbol
        # initialize value estimations: 1 for end states won by this player,
        # 0 for end states lost or tied, 0.5 for everything else
        for hash in self.allStates.keys():
            (state, isEnd) = self.allStates[hash]
            if isEnd:
                if state.winner == self.symbol:
                    self.estimations[hash] = 1.0
                else:
                    self.estimations[hash] = 0
            else:
                self.estimations[hash] = 0.5

    # accept a state
    def feedState(self, state):
        self.states.append(state)

    # update estimations according to the reward by backing up from the end of
    # the episode: each visited state's value moves toward the value of the
    # state that followed it, and the last state moves toward the final reward
    def feedReward(self, reward):
        if len(self.states) == 0:
            return
        self.states = [state.getHash() for state in self.states]
        target = reward
        for latestState in reversed(self.states):
            value = self.estimations[latestState] + self.stepSize * (target - self.estimations[latestState])
            self.estimations[latestState] = value
            target = value
        self.states = []

    # determine the next action
    def takeAction(self):
        state = self.states[-1]
        nextStates = []
        nextPositions = []
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    nextPositions.append([i, j])
                    nextStates.append(state.nextState(i, j, self.symbol).getHash())
        if np.random.binomial(1, self.exploreRate):
            # exploratory move: pick a random empty position
            np.random.shuffle(nextPositions)
            # Not sure if truncating is the best way to deal with an exploratory step.
            # Maybe it's better to only skip this step rather than forget all the history.
            self.states = []
            action = nextPositions[0]
            action.append(self.symbol)
            return action
        # greedy move: pick the position leading to the highest-valued state,
        # breaking ties randomly via the shuffle before the stable sort
        values = []
        for hash, pos in zip(nextStates, nextPositions):
            values.append((self.estimations[hash], pos))
        np.random.shuffle(values)
        values.sort(key=lambda x: x[0], reverse=True)
        action = values[0][1]
        action.append(self.symbol)
        return action

    def savePolicy(self):
        with open('optimal_policy_' + str(self.symbol), 'wb') as fw:
            pickle.dump(self.estimations, fw)

    def loadPolicy(self):
        with open('optimal_policy_' + str(self.symbol), 'rb') as fr:
            self.estimations = pickle.load(fr)
# human interface
# enter a number from 1 to 9 to place a piece at the corresponding cell:
# | 1 | 2 | 3 |
# | 4 | 5 | 6 |
# | 7 | 8 | 9 |
class HumanPlayer:
    def __init__(self, stepSize=0.1, exploreRate=0.1):
        self.symbol = None
        self.currentState = None
        return

    def reset(self):
        return

    def setSymbol(self, symbol):
        self.symbol = symbol
        return

    def feedState(self, state):
        self.currentState = state
        return

    def feedReward(self, reward):
        return

    def takeAction(self):
        data = int(input("Input your position:"))
        data -= 1
        i = data // BOARD_COLS
        j = data % BOARD_COLS
        if self.currentState.data[i, j] != 0:
            # occupied position: ask again
            return self.takeAction()
        return (i, j, self.symbol)
# train two AI players against each other and save their learned policies
def train(epochs=20000):
    player1 = Player()
    player2 = Player()
    judger = Judger(player1, player2)
    player1Win = 0.0
    player2Win = 0.0
    for i in range(0, epochs):
        if i % 5000 == 0:
            print("Train Epoch", i)
        winner = judger.play()
        if winner == 1:
            player1Win += 1
        if winner == -1:
            player2Win += 1
        judger.reset()
    print(player1Win / epochs)
    print(player2Win / epochs)
    player1.savePolicy()
    player2.savePolicy()

# let the two trained (greedy, no exploration) AI players play against each other
def compete(turns=2000):
    player1 = Player(exploreRate=0)
    player2 = Player(exploreRate=0)
    judger = Judger(player1, player2, False)
    player1.loadPolicy()
    player2.loadPolicy()
    player1Win = 0.0
    player2Win = 0.0
    for i in range(0, turns):
        if i % 5000 == 0:
            print("Compete Turn", i)
        winner = judger.play()
        if winner == 1:
            player1Win += 1
        if winner == -1:
            player2Win += 1
        judger.reset()
    print(player1Win / turns)
    print(player2Win / turns)

# play against the trained AI: the AI moves first, the human second
def play():
    while True:
        player1 = Player(exploreRate=0)
        player2 = HumanPlayer()
        judger = Judger(player1, player2, False)
        player1.loadPolicy()
        winner = judger.play(True)
        if winner == player2.symbol:
            print("Win!")
        elif winner == player1.symbol:
            print("Lose!")
        else:
            print("Tie!")
In [12]:
train()
compete()
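# To play against the trained agent from the keyboard afterwards, one could
# call the play() function defined above (the AI moves first, the human second):
# play()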